# this will help in making the Python code more structured automatically
%load_ext nb_black
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# to scale the data using z-score
from sklearn.preprocessing import StandardScaler
# to compute distances
from scipy.spatial.distance import cdist
# to perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# to visualize the elbow curve and silhouette scores
# from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from scipy.spatial.distance import pdist
# to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
The nb_black extension is already loaded. To reload it, use: %reload_ext nb_black
url = "stock_data.csv"
stockData = pd.read_csv(url)
# creditData = pd.read_csv("credit.csv")
stockData.head(10) # several missing values!
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AAL | American Airlines Group | Industrials | Airlines | 42.349998 | 9.999995 | 1.687151 | 135 | 51 | -604000000 | 7610000000 | 11.39 | 6.681299e+08 | 3.718174 | -8.784219 |
| 1 | ABBV | AbbVie | Health Care | Pharmaceuticals | 59.240002 | 8.339433 | 2.197887 | 130 | 77 | 51000000 | 5144000000 | 3.15 | 1.633016e+09 | 18.806350 | -8.750068 |
| 2 | ABT | Abbott Laboratories | Health Care | Health Care Equipment | 44.910000 | 11.301121 | 1.273646 | 21 | 67 | 938000000 | 4423000000 | 2.94 | 1.504422e+09 | 15.275510 | -0.394171 |
| 3 | ADBE | Adobe Systems Inc | Information Technology | Application Software | 93.940002 | 13.977195 | 1.357679 | 9 | 180 | -240840000 | 629551000 | 1.26 | 4.996437e+08 | 74.555557 | 4.199651 |
| 4 | ADI | Analog Devices, Inc. | Information Technology | Semiconductors | 55.320000 | -1.827858 | 1.701169 | 14 | 272 | 315120000 | 696878000 | 0.31 | 2.247994e+09 | 178.451613 | 1.059810 |
| 5 | ADM | Archer-Daniels-Midland Co | Consumer Staples | Agricultural Products | 36.680000 | -12.017268 | 1.516493 | 10 | 49 | -189000000 | 1849000000 | 2.99 | 6.183946e+08 | 12.267559 | 7.496831 |
| 6 | ADS | Alliance Data Systems | Information Technology | Data Processing & Outsourced Services | 276.570007 | 6.189286 | 1.116976 | 30 | 25 | 90885000 | 596541000 | 8.91 | 6.695185e+07 | 31.040405 | 129.064585 |
| 7 | AEE | Ameren Corp | Utilities | MultiUtilities | 43.230000 | 2.174424 | 1.124186 | 9 | 14 | 287000000 | 636000000 | 2.60 | 2.446154e+08 | 16.626923 | -0.719497 |
| 8 | AEP | American Electric Power | Utilities | Electric Utilities | 58.270000 | 2.371753 | 1.068485 | 11 | 9 | 13900000 | 2052300000 | 3.13 | 4.218978e+08 | 18.456543 | -3.022649 |
| 9 | AFL | AFLAC Inc | Financials | Life & Health Insurance | 59.900002 | 3.027181 | 1.048295 | 14 | 99 | -308000000 | 2533000000 | 5.88 | 4.307823e+08 | 10.187075 | -1.883912 |
# copying data to another data frame to avaoid changes in the original data
data_new = stockData.copy()
data_new.shape
(340, 15)
The Dataset has 340 rows and 15 columns.
# Selecting duplicate rows except first
# occurrence based on all columns
duplicate = data_new[data_new.duplicated()]
duplicate
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio |
|---|
There are no duplicate rows.
data_new.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 340 entries, 0 to 339 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ticker Symbol 340 non-null object 1 Security 340 non-null object 2 GICS Sector 340 non-null object 3 GICS Sub Industry 340 non-null object 4 Current Price 340 non-null float64 5 Price Change 340 non-null float64 6 Volatility 340 non-null float64 7 ROE 340 non-null int64 8 Cash Ratio 340 non-null int64 9 Net Cash Flow 340 non-null int64 10 Net Income 340 non-null int64 11 Earnings Per Share 340 non-null float64 12 Estimated Shares Outstanding 340 non-null float64 13 P/E Ratio 340 non-null float64 14 P/B Ratio 340 non-null float64 dtypes: float64(7), int64(4), object(4) memory usage: 40.0+ KB
data_new.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Current Price | 340.0 | 8.086234e+01 | 9.805509e+01 | 4.500000e+00 | 3.855500e+01 | 5.970500e+01 | 9.288000e+01 | 1.274950e+03 |
| Price Change | 340.0 | 4.078194e+00 | 1.200634e+01 | -4.712969e+01 | -9.394838e-01 | 4.819505e+00 | 1.069549e+01 | 5.505168e+01 |
| Volatility | 340.0 | 1.525976e+00 | 5.917984e-01 | 7.331632e-01 | 1.134878e+00 | 1.385593e+00 | 1.695549e+00 | 4.580042e+00 |
| ROE | 340.0 | 3.959706e+01 | 9.654754e+01 | 1.000000e+00 | 9.750000e+00 | 1.500000e+01 | 2.700000e+01 | 9.170000e+02 |
| Cash Ratio | 340.0 | 7.002353e+01 | 9.042133e+01 | 0.000000e+00 | 1.800000e+01 | 4.700000e+01 | 9.900000e+01 | 9.580000e+02 |
| Net Cash Flow | 340.0 | 5.553762e+07 | 1.946365e+09 | -1.120800e+10 | -1.939065e+08 | 2.098000e+06 | 1.698108e+08 | 2.076400e+10 |
| Net Income | 340.0 | 1.494385e+09 | 3.940150e+09 | -2.352800e+10 | 3.523012e+08 | 7.073360e+08 | 1.899000e+09 | 2.444200e+10 |
| Earnings Per Share | 340.0 | 2.776662e+00 | 6.587779e+00 | -6.120000e+01 | 1.557500e+00 | 2.895000e+00 | 4.620000e+00 | 5.009000e+01 |
| Estimated Shares Outstanding | 340.0 | 5.770283e+08 | 8.458496e+08 | 2.767216e+07 | 1.588482e+08 | 3.096751e+08 | 5.731175e+08 | 6.159292e+09 |
| P/E Ratio | 340.0 | 3.261256e+01 | 4.434873e+01 | 2.935451e+00 | 1.504465e+01 | 2.081988e+01 | 3.176476e+01 | 5.280391e+02 |
| P/B Ratio | 340.0 | -1.718249e+00 | 1.396691e+01 | -7.611908e+01 | -4.352056e+00 | -1.067170e+00 | 3.917066e+00 | 1.290646e+02 |
data_new.describe(exclude="number").T
| count | unique | top | freq | |
|---|---|---|---|---|
| Ticker Symbol | 340 | 340 | MAT | 1 |
| Security | 340 | 340 | Philip Morris International | 1 |
| GICS Sector | 340 | 11 | Industrials | 53 |
| GICS Sub Industry | 340 | 104 | Oil & Gas Exploration & Production | 16 |
data_new.isna().sum()
Ticker Symbol 0 Security 0 GICS Sector 0 GICS Sub Industry 0 Current Price 0 Price Change 0 Volatility 0 ROE 0 Cash Ratio 0 Net Cash Flow 0 Net Income 0 Earnings Per Share 0 Estimated Shares Outstanding 0 P/E Ratio 0 P/B Ratio 0 dtype: int64
There are no missing values in the data provided.
category = ["Ticker Symbol", "Security", "GICS Sector", "GICS Sub Industry"]
for column in category:
print(data_new[column].value_counts())
print("_" * 40)
MAT 1
UTX 1
AJG 1
MTB 1
ALL 1
..
AXP 1
ISRG 1
C 1
BLL 1
DLPH 1
Name: Ticker Symbol, Length: 340, dtype: int64
________________________________________
Philip Morris International 1
SCANA Corp 1
Genuine Parts 1
Newell Brands 1
United Technologies 1
..
Apache Corporation 1
CME Group Inc. 1
PPL Corp. 1
PepsiCo Inc. 1
Western Union Co 1
Name: Security, Length: 340, dtype: int64
________________________________________
Industrials 53
Financials 49
Health Care 40
Consumer Discretionary 40
Information Technology 33
Energy 30
Real Estate 27
Utilities 24
Materials 20
Consumer Staples 19
Telecommunications Services 5
Name: GICS Sector, dtype: int64
________________________________________
Oil & Gas Exploration & Production 16
Industrial Conglomerates 14
REITs 14
Electric Utilities 12
Internet Software & Services 12
Health Care Equipment 11
MultiUtilities 11
Banks 10
Property & Casualty Insurance 8
Diversified Financial Services 7
Biotechnology 7
Pharmaceuticals 6
Oil & Gas Refining & Marketing & Transportation 6
Packaged Foods & Meats 6
Semiconductors 6
Diversified Chemicals 5
Managed Health Care 5
Health Care Facilities 5
Consumer Finance 5
Airlines 5
Industrial Machinery 5
Integrated Oil & Gas 5
Integrated Telecommunications Services 4
Hotels, Resorts & Cruise Lines 4
Aerospace & Defense 4
Building Products 4
Retail REITs 4
Internet & Direct Marketing Retail 4
Railroads 4
Asset Management & Custody Banks 4
Soft Drinks 4
Research & Consulting Services 4
Residential REITs 4
Specialty Chemicals 4
Regional Banks 3
Oil & Gas Equipment & Services 3
Specialty Stores 3
Cable & Satellite 3
Life & Health Insurance 3
Air Freight & Logistics 3
Restaurants 3
Household Products 3
Specialized REITs 3
Insurance Brokers 3
IT Consulting & Other Services 3
Health Care Distributors 3
Construction & Farm Machinery & Heavy Trucks 3
Leisure Products 2
Broadcasting & Cable TV 2
Auto Parts & Equipment 2
Investment Banking & Brokerage 2
Tobacco 2
Data Processing & Outsourced Services 2
Construction Materials 2
Paper Packaging 2
Automobile Manufacturers 2
Application Software 2
Health Care Supplies 2
Fertilizers & Agricultural Chemicals 2
Advertising 2
Homebuilding 2
Electronic Components 2
Water Utilities 1
Electrical Components & Equipment 1
Multi-Sector Holdings 1
Thrifts & Mortgage Finance 1
Steel 1
Motorcycle Manufacturers 1
Personal Products 1
Copper 1
Technology Hardware, Storage & Peripherals 1
Financial Exchanges & Data 1
Gold 1
Tires & Rubber 1
Apparel, Accessories & Luxury Goods 1
Computer Hardware 1
Semiconductor Equipment 1
Real Estate Services 1
Office REITs 1
Life Sciences Tools & Services 1
Human Resource & Employment Services 1
Diversified Commercial Services 1
Agricultural Products 1
Alternative Carriers 1
Drug Retail 1
Specialty Retail 1
Networking Equipment 1
Casinos & Gaming 1
Household Appliances 1
Distributors 1
Housewares & Specialties 1
Industrial Materials 1
Publishing 1
Brewers 1
Multi-line Insurance 1
Consumer Electronics 1
Trucking 1
Environmental Services 1
Metal & Glass Containers 1
Home Furnishings 1
Home Entertainment Software 1
Technology, Hardware, Software and Supplies 1
Electronic Equipment & Instruments 1
Industrial Gases 1
Name: GICS Sub Industry, dtype: int64
________________________________________
data_new.head()
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AAL | American Airlines Group | Industrials | Airlines | 42.349998 | 9.999995 | 1.687151 | 135 | 51 | -604000000 | 7610000000 | 11.39 | 6.681299e+08 | 3.718174 | -8.784219 |
| 1 | ABBV | AbbVie | Health Care | Pharmaceuticals | 59.240002 | 8.339433 | 2.197887 | 130 | 77 | 51000000 | 5144000000 | 3.15 | 1.633016e+09 | 18.806350 | -8.750068 |
| 2 | ABT | Abbott Laboratories | Health Care | Health Care Equipment | 44.910000 | 11.301121 | 1.273646 | 21 | 67 | 938000000 | 4423000000 | 2.94 | 1.504422e+09 | 15.275510 | -0.394171 |
| 3 | ADBE | Adobe Systems Inc | Information Technology | Application Software | 93.940002 | 13.977195 | 1.357679 | 9 | 180 | -240840000 | 629551000 | 1.26 | 4.996437e+08 | 74.555557 | 4.199651 |
| 4 | ADI | Analog Devices, Inc. | Information Technology | Semiconductors | 55.320000 | -1.827858 | 1.701169 | 14 | 272 | 315120000 | 696878000 | 0.31 | 2.247994e+09 | 178.451613 | 1.059810 |
print(data_new.Security.unique())
['American Airlines Group' 'AbbVie' 'Abbott Laboratories' 'Adobe Systems Inc' 'Analog Devices, Inc.' 'Archer-Daniels-Midland Co' 'Alliance Data Systems' 'Ameren Corp' 'American Electric Power' 'AFLAC Inc' 'American International Group, Inc.' 'Apartment Investment & Mgmt' 'Assurant Inc' 'Arthur J. Gallagher & Co.' 'Akamai Technologies Inc' 'Albemarle Corp' 'Alaska Air Group Inc' 'Allstate Corp' 'Allegion' 'Alexion Pharmaceuticals' 'Applied Materials Inc' 'AMETEK Inc' 'Affiliated Managers Group Inc' 'Amgen Inc' 'Ameriprise Financial' 'American Tower Corp A' 'Amazon.com Inc' 'AutoNation Inc' 'Anthem Inc.' 'Aon plc' 'Apache Corporation' 'Anadarko Petroleum Corp' 'Amphenol Corp' 'Arconic Inc' 'Activision Blizzard' 'AvalonBay Communities, Inc.' 'Broadcom' 'American Water Works Company Inc' 'American Express Co' 'Boeing Company' 'Bank of America Corp' 'Baxter International Inc.' 'BB&T Corporation' 'Bard (C.R.) Inc.' 'Baker Hughes Inc' 'BIOGEN IDEC Inc.' 'The Bank of New York Mellon Corp.' 'Ball Corp' 'Bristol-Myers Squibb' 'Boston Scientific' 'BorgWarner' 'Boston Properties' 'Citigroup Inc.' 'Caterpillar Inc.' 'Chubb Limited' 'CBRE Group' 'Crown Castle International Corp.' 'Carnival Corp.' 'Celgene Corp.' 'CF Industries Holdings Inc' 'Citizens Financial Group' 'Church & Dwight' 'Chesapeake Energy' 'C. H. Robinson Worldwide' 'Charter Communications' 'CIGNA Corp.' 'Cincinnati Financial' 'Colgate-Palmolive' 'Comerica Inc.' 'CME Group Inc.' 'Chipotle Mexican Grill' 'Cummins Inc.' 'CMS Energy' 'Centene Corporation' 'CenterPoint Energy' 'Capital One Financial' 'Cabot Oil & Gas' 'The Cooper Companies' 'CSX Corp.' 'CenturyLink Inc' 'Cognizant Technology Solutions' 'Citrix Systems' 'CVS Health' 'Chevron Corp.' 'Concho Resources' 'Dominion Resources' 'Delta Air Lines' 'Du Pont (E.I.)' 'Deere & Co.' 'Discover Financial Services' 'Quest Diagnostics' 'Danaher Corp.' 'The Walt Disney Company' 'Discovery Communications-A' 'Discovery Communications-C' 'Delphi Automotive' 'Digital Realty Trust' 'Dun & Bradstreet' 'Dover Corp.' 'Dr Pepper Snapple Group' 'Duke Energy' 'DaVita Inc.' 'Devon Energy Corp.' 'eBay Inc.' 'Ecolab Inc.' 'Consolidated Edison' 'Equifax Inc.' "Edison Int'l" 'Eastman Chemical' 'EOG Resources' 'Equinix' 'Equity Residential' 'EQT Corporation' 'Eversource Energy' 'Essex Property Trust, Inc.' 'E*Trade' 'Eaton Corporation' 'Entergy Corp.' 'Edwards Lifesciences' 'Exelon Corp.' "Expeditors Int'l" 'Expedia Inc.' 'Extra Space Storage' 'Ford Motor' 'Fastenal Co' 'Facebook' 'Fortune Brands Home & Security' 'Freeport-McMoran Cp & Gld' 'FirstEnergy Corp' 'Fidelity National Information Services' 'Fiserv Inc' 'FLIR Systems' 'Fluor Corp.' 'Flowserve Corporation' 'FMC Corporation' 'Federal Realty Investment Trust' 'First Solar Inc' 'Frontier Communications' 'General Dynamics' 'General Growth Properties Inc.' 'Gilead Sciences' 'Corning Inc.' 'General Motors' 'Genuine Parts' 'Garmin Ltd.' 'Goodyear Tire & Rubber' 'Grainger (W.W.) Inc.' 'Halliburton Co.' 'Hasbro Inc.' 'Huntington Bancshares' 'HCA Holdings' 'Welltower Inc.' 'HCP Inc.' 'Hess Corporation' 'Hartford Financial Svc.Gp.' 'Harley-Davidson' "Honeywell Int'l Inc." 'Hewlett Packard Enterprise' 'HP Inc.' 'Hormel Foods Corp.' 'Henry Schein' 'Host Hotels & Resorts' 'The Hershey Company' 'Humana Inc.' 'International Business Machines' 'IDEXX Laboratories' 'Intl Flavors & Fragrances' 'Intel Corp.' 'International Paper' 'Interpublic Group' 'Iron Mountain Incorporated' 'Intuitive Surgical Inc.' 'Illinois Tool Works' 'Invesco Ltd.' 'J. B. Hunt Transport Services' 'Jacobs Engineering Group' 'Juniper Networks' 'JPMorgan Chase & Co.' 'Kimco Realty' 'Kimberly-Clark' 'Kinder Morgan' 'Coca Cola Company' 'Kansas City Southern' 'Leggett & Platt' 'Lennar Corp.' 'Laboratory Corp. of America Holding' 'LKQ Corporation' 'L-3 Communications Holdings' 'Lilly (Eli) & Co.' 'Lockheed Martin Corp.' 'Alliant Energy Corp' 'Leucadia National Corp.' 'Southwest Airlines' 'Level 3 Communications' 'LyondellBasell' 'Mastercard Inc.' 'Mid-America Apartments' 'Macerich' "Marriott Int'l." 'Masco Corp.' 'Mattel Inc.' "McDonald's Corp." "Moody's Corp" 'Mondelez International' 'MetLife Inc.' 'Mohawk Industries' 'Mead Johnson' 'McCormick & Co.' 'Martin Marietta Materials' 'Marsh & McLennan' '3M Company' 'Monster Beverage' 'Altria Group Inc' 'The Mosaic Company' 'Marathon Petroleum' 'Merck & Co.' 'Marathon Oil Corp.' 'M&T Bank Corp.' 'Mettler Toledo' 'Murphy Oil' 'Mylan N.V.' 'Navient' 'Noble Energy Inc' 'NASDAQ OMX Group' 'NextEra Energy' 'Newmont Mining Corp. (Hldg. Co.)' 'Netflix Inc.' 'Newfield Exploration Co' 'Nielsen Holdings' 'National Oilwell Varco Inc.' 'Norfolk Southern Corp.' 'Northern Trust Corp.' 'Nucor Corp.' 'Newell Brands' 'Realty Income Corporation' 'ONEOK' 'Omnicom Group' "O'Reilly Automotive" 'Occidental Petroleum' "People's United Financial" 'Pitney-Bowes' 'PACCAR Inc.' 'PG&E Corp.' 'Priceline.com Inc' 'Public Serv. Enterprise Inc.' 'PepsiCo Inc.' 'Pfizer Inc.' 'Principal Financial Group' 'Procter & Gamble' 'Progressive Corp.' 'Pulte Homes Inc.' 'Philip Morris International' 'PNC Financial Services' 'Pentair Ltd.' 'Pinnacle West Capital' 'PPG Industries' 'PPL Corp.' 'Prudential Financial' 'Phillips 66' 'Quanta Services Inc.' 'Praxair Inc.' 'PayPal' 'Ryder System' 'Royal Caribbean Cruises Ltd' 'Regeneron' 'Robert Half International' 'Roper Industries' 'Range Resources Corp.' 'Republic Services Inc' 'SCANA Corp' 'Charles Schwab Corporation' 'Spectra Energy Corp.' 'Sealed Air' 'Sherwin-Williams' 'SL Green Realty' 'Scripps Networks Interactive Inc.' 'Southern Co.' 'Simon Property Group Inc' 'S&P Global, Inc.' 'Stericycle Inc' 'Sempra Energy' 'SunTrust Banks' 'State Street Corp.' 'Skyworks Solutions' 'Southwestern Energy' 'Synchrony Financial' 'Stryker Corp.' 'AT&T Inc' 'Molson Coors Brewing Company' 'Teradata Corp.' 'Tegna, Inc.' 'Torchmark Corp.' 'Thermo Fisher Scientific' 'TripAdvisor' 'The Travelers Companies Inc.' 'Tractor Supply Company' 'Tyson Foods' 'Tesoro Petroleum Co.' 'Total System Services' 'Texas Instruments' 'Under Armour' 'United Continental Holdings' 'UDR Inc' 'Universal Health Services, Inc.' 'United Health Group Inc.' 'Unum Group' 'Union Pacific' 'United Parcel Service' 'United Technologies' 'Varian Medical Systems' 'Valero Energy' 'Vulcan Materials' 'Vornado Realty Trust' 'Verisk Analytics' 'Verisign Inc.' 'Vertex Pharmaceuticals Inc' 'Ventas Inc' 'Verizon Communications' 'Waters Corporation' 'Wec Energy Group Inc' 'Wells Fargo' 'Whirlpool Corp.' 'Waste Management Inc.' 'Williams Cos.' 'Western Union Co' 'Weyerhaeuser Corp.' 'Wyndham Worldwide' 'Wynn Resorts Ltd' 'Cimarex Energy' 'Xcel Energy Inc' 'XL Capital' 'Exxon Mobil Corp.' 'Dentsply Sirona' 'Xerox Corp.' 'Xylem Inc.' 'Yahoo Inc.' 'Yum! Brands Inc' 'Zimmer Biomet Holdings' 'Zions Bancorp' 'Zoetis']
print(data_new["GICS Sector"].unique())
['Industrials' 'Health Care' 'Information Technology' 'Consumer Staples' 'Utilities' 'Financials' 'Real Estate' 'Materials' 'Consumer Discretionary' 'Energy' 'Telecommunications Services']
print(data_new["GICS Sub Industry"].unique())
['Airlines' 'Pharmaceuticals' 'Health Care Equipment' 'Application Software' 'Semiconductors' 'Agricultural Products' 'Data Processing & Outsourced Services' 'MultiUtilities' 'Electric Utilities' 'Life & Health Insurance' 'Property & Casualty Insurance' 'REITs' 'Multi-line Insurance' 'Insurance Brokers' 'Internet Software & Services' 'Specialty Chemicals' 'Building Products' 'Biotechnology' 'Semiconductor Equipment' 'Electrical Components & Equipment' 'Asset Management & Custody Banks' 'Specialized REITs' 'Internet & Direct Marketing Retail' 'Specialty Stores' 'Managed Health Care' 'Oil & Gas Exploration & Production' 'Electronic Components' 'Aerospace & Defense' 'Home Entertainment Software' 'Residential REITs' 'Water Utilities' 'Consumer Finance' 'Banks' 'Oil & Gas Equipment & Services' 'Metal & Glass Containers' 'Health Care Distributors' 'Auto Parts & Equipment' 'Construction & Farm Machinery & Heavy Trucks' 'Real Estate Services' 'Hotels, Resorts & Cruise Lines' 'Fertilizers & Agricultural Chemicals' 'Regional Banks' 'Household Products' 'Integrated Oil & Gas' 'Air Freight & Logistics' 'Cable & Satellite' 'Financial Exchanges & Data' 'Restaurants' 'Industrial Machinery' 'Health Care Supplies' 'Railroads' 'Integrated Telecommunications Services' 'IT Consulting & Other Services' 'Drug Retail' 'Diversified Chemicals' 'Health Care Facilities' 'Industrial Conglomerates' 'Broadcasting & Cable TV' 'Research & Consulting Services' 'Soft Drinks' 'Investment Banking & Brokerage' 'Automobile Manufacturers' 'Copper' 'Electronic Equipment & Instruments' 'Diversified Commercial Services' 'Retail REITs' 'Consumer Electronics' 'Tires & Rubber' 'Industrial Materials' 'Leisure Products' 'Motorcycle Manufacturers' 'Technology Hardware, Storage & Peripherals' 'Computer Hardware' 'Packaged Foods & Meats' 'Paper Packaging' 'Advertising' 'Trucking' 'Networking Equipment' 'Oil & Gas Refining & Marketing & Transportation' 'Homebuilding' 'Distributors' 'Multi-Sector Holdings' 'Alternative Carriers' 'Diversified Financial Services' 'Home Furnishings' 'Construction Materials' 'Tobacco' 'Life Sciences Tools & Services' 'Gold' 'Steel' 'Housewares & Specialties' 'Thrifts & Mortgage Finance' 'Technology, Hardware, Software and Supplies' 'Personal Products' 'Industrial Gases' 'Human Resource & Employment Services' 'Office REITs' 'Brewers' 'Publishing' 'Specialty Retail' 'Apparel, Accessories & Luxury Goods' 'Household Appliances' 'Environmental Services' 'Casinos & Gaming']
data_new["Ticker Symbol"] = data_new["Ticker Symbol"].astype("category")
data_new.Security = data_new.Security.astype("category")
data_new["GICS Sector"] = data_new["GICS Sector"].astype("category")
data_new["GICS Sub Industry"] = data_new["GICS Sub Industry"].astype("category")
data_new.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 340 entries, 0 to 339 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ticker Symbol 340 non-null category 1 Security 340 non-null category 2 GICS Sector 340 non-null category 3 GICS Sub Industry 340 non-null category 4 Current Price 340 non-null float64 5 Price Change 340 non-null float64 6 Volatility 340 non-null float64 7 ROE 340 non-null int64 8 Cash Ratio 340 non-null int64 9 Net Cash Flow 340 non-null int64 10 Net Income 340 non-null int64 11 Earnings Per Share 340 non-null float64 12 Estimated Shares Outstanding 340 non-null float64 13 P/E Ratio 340 non-null float64 14 P/B Ratio 340 non-null float64 dtypes: category(4), float64(7), int64(4) memory usage: 58.1 KB
All the object variables are converted to categorical variables.
data_new.describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Ticker Symbol | 340 | 340 | AAL | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Security | 340 | 340 | 3M Company | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| GICS Sector | 340 | 11 | Industrials | 53 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| GICS Sub Industry | 340 | 104 | Oil & Gas Exploration & Production | 16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Current Price | 340.0 | NaN | NaN | NaN | 80.862345 | 98.055086 | 4.5 | 38.555 | 59.705 | 92.880001 | 1274.949951 |
| Price Change | 340.0 | NaN | NaN | NaN | 4.078194 | 12.006338 | -47.129693 | -0.939484 | 4.819505 | 10.695493 | 55.051683 |
| Volatility | 340.0 | NaN | NaN | NaN | 1.525976 | 0.591798 | 0.733163 | 1.134878 | 1.385593 | 1.695549 | 4.580042 |
| ROE | 340.0 | NaN | NaN | NaN | 39.597059 | 96.547538 | 1.0 | 9.75 | 15.0 | 27.0 | 917.0 |
| Cash Ratio | 340.0 | NaN | NaN | NaN | 70.023529 | 90.421331 | 0.0 | 18.0 | 47.0 | 99.0 | 958.0 |
| Net Cash Flow | 340.0 | NaN | NaN | NaN | 55537620.588235 | 1946365312.175789 | -11208000000.0 | -193906500.0 | 2098000.0 | 169810750.0 | 20764000000.0 |
| Net Income | 340.0 | NaN | NaN | NaN | 1494384602.941176 | 3940150279.327937 | -23528000000.0 | 352301250.0 | 707336000.0 | 1899000000.0 | 24442000000.0 |
| Earnings Per Share | 340.0 | NaN | NaN | NaN | 2.776662 | 6.587779 | -61.2 | 1.5575 | 2.895 | 4.62 | 50.09 |
| Estimated Shares Outstanding | 340.0 | NaN | NaN | NaN | 577028337.754029 | 845849595.417695 | 27672156.86 | 158848216.1 | 309675137.8 | 573117457.325 | 6159292035.0 |
| P/E Ratio | 340.0 | NaN | NaN | NaN | 32.612563 | 44.348731 | 2.935451 | 15.044653 | 20.819876 | 31.764755 | 528.039074 |
| P/B Ratio | 340.0 | NaN | NaN | NaN | -1.718249 | 13.966912 | -76.119077 | -4.352056 | -1.06717 | 3.917066 | 129.064585 |
data_new.isna().sum()
Ticker Symbol 0 Security 0 GICS Sector 0 GICS Sub Industry 0 Current Price 0 Price Change 0 Volatility 0 ROE 0 Cash Ratio 0 Net Cash Flow 0 Net Income 0 Earnings Per Share 0 Estimated Shares Outstanding 0 P/E Ratio 0 P/B Ratio 0 dtype: int64
There are no missing values.
data_new.isnull().sum()
Ticker Symbol 0 Security 0 GICS Sector 0 GICS Sub Industry 0 Current Price 0 Price Change 0 Volatility 0 ROE 0 Cash Ratio 0 Net Cash Flow 0 Net Income 0 Earnings Per Share 0 Estimated Shares Outstanding 0 P/E Ratio 0 P/B Ratio 0 dtype: int64
There are no null Values in the given dataset.
data_new.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 340 entries, 0 to 339 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ticker Symbol 340 non-null category 1 Security 340 non-null category 2 GICS Sector 340 non-null category 3 GICS Sub Industry 340 non-null category 4 Current Price 340 non-null float64 5 Price Change 340 non-null float64 6 Volatility 340 non-null float64 7 ROE 340 non-null int64 8 Cash Ratio 340 non-null int64 9 Net Cash Flow 340 non-null int64 10 Net Income 340 non-null int64 11 Earnings Per Share 340 non-null float64 12 Estimated Shares Outstanding 340 non-null float64 13 P/E Ratio 340 non-null float64 14 P/B Ratio 340 non-null float64 dtypes: category(4), float64(7), int64(4) memory usage: 58.1 KB
Questions:
df = data_new.copy()
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (12,7))
kde: whether to the show density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows=2, # Number of rows of the subplot grid= 2
sharex=True, # x-axis will be shared among all subplots
gridspec_kw={"height_ratios": (0.25, 0.75)},
figsize=figsize,
) # creating the 2 subplots
sns.boxplot(
data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
) # boxplot will be created and a star will indicate the mean value of the column
sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
) if bins else sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color="green", linestyle="--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color="black", linestyle="-"
) # Add median to the histogram
# selecting numerical columns
numerical_column = df.select_dtypes(include=np.number).columns.tolist()
for feature in numerical_column:
histogram_boxplot(df, feature)
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
fig.suptitle("CDF plot of numerical variables", fontsize=20)
counter = 0
for ii in range(3):
sns.ecdfplot(ax=axes[ii][0], x=df[numerical_column[counter]])
counter = counter + 1
if counter != 5:
sns.ecdfplot(ax=axes[ii][1], x=df[numerical_column[counter]])
counter = counter + 1
else:
pass
fig.tight_layout(pad=2.0)
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
"""
Barplot with percentage at the top
data: dataframe
feature: dataframe column
perc: whether to display percentages instead of count (default is False)
n: displays the top n category levels (default is None, i.e., display all levels)
"""
total = len(data[feature]) # length of the column
count = data[feature].nunique()
if n is None:
plt.figure(figsize=(count + 1, 5))
else:
plt.figure(figsize=(n + 1, 5))
plt.xticks(rotation=90, fontsize=15)
ax = sns.countplot(
data=data,
x=feature,
palette="Paired",
order=data[feature].value_counts().index[:n].sort_values(),
)
for p in ax.patches:
if perc:
label = "{:.1f}%".format(
100 * p.get_height() / total
) # percentage of each class of the category
else:
label = p.get_height() # count of each level of the category
x = p.get_x() + p.get_width() / 2 # width of the plot
y = p.get_height() # height of the plot
ax.annotate(
label,
(x, y),
ha="center",
va="center",
size=12,
xytext=(0, 5),
textcoords="offset points",
) # annotate the percentage
plt.show() # show the plot
# let's explore discounts further
labeled_barplot(df, "GICS Sector", perc=True)
plt.figure(figsize=(15, 7))
sns.heatmap(
df[numerical_column].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()
a. Volatility and Current Price, Price Change, Net Cash Flow, Net Income, Earnings Per Share and Estimated Shares Outstanding. b. Current Price and ROE, Net Cash Flow. c. Price Change and ROE. d. ROE has negative correlation with all except Volatility and P/E Ratio. e. Net Cash FLow is negatively correlated with current price, Volatility and Estimated shares outstanding. f. Earnings per share and Estimated shares outstanding, P/E Ratio. g. P/E Ratio is negatively correlated with Price Change, Net Income, Earnings Per Change and Estimated shares outstanding. showing, if one value increases the other decreases.
sns.pairplot(data=df[numerical_column], diag_kind="kde")
plt.show()
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Current Price", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Price Change", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Volatility", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="ROE", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Cash Ratio", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Net Cash Flow", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Net Income", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Earnings Per Share", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Estimated Shares Outstanding", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="P/E Ratio", data=df)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="P/B Ratio", data=df)
histogram_boxplot(df, "Current Price")
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Price Change", data=df)
plt.figure(figsize=(15, 7))
sns.heatmap(
df[numerical_column].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Cash Ratio", data=df)
High Cash Ratio is preferred, as it indicates that a company can easily pay its debt.
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="P/E Ratio", data=df)
# let's create a copy of the data
df1 = df.copy()
np.random.seed(1)
df1.sample(n=10) # Return a random sample of 10 rows from the dataframe 'data'
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 102 | DVN | Devon Energy Corp. | Energy | Oil & Gas Exploration & Production | 32.000000 | -15.478079 | 2.923698 | 205 | 70 | 830000000 | -14454000000 | -35.55 | 4.065823e+08 | 93.089287 | 1.785616 |
| 125 | FB | Information Technology | Internet Software & Services | 104.660004 | 16.224320 | 1.320606 | 8 | 958 | 592000000 | 3669000000 | 1.31 | 2.800763e+09 | 79.893133 | 5.884467 | |
| 11 | AIV | Apartment Investment & Mgmt | Real Estate | REITs | 40.029999 | 7.578608 | 1.163334 | 15 | 47 | 21818000 | 248710000 | 1.52 | 1.636250e+08 | 26.335526 | -1.269332 |
| 248 | PG | Procter & Gamble | Consumer Staples | Personal Products | 79.410004 | 10.660538 | 0.806056 | 17 | 129 | 160383000 | 636056000 | 3.28 | 4.913916e+08 | 24.070121 | -2.256747 |
| 238 | OXY | Occidental Petroleum | Energy | Oil & Gas Exploration & Production | 67.610001 | 0.865287 | 1.589520 | 32 | 64 | -588000000 | -7829000000 | -10.23 | 7.652981e+08 | 93.089287 | 3.345102 |
| 336 | YUM | Yum! Brands Inc | Consumer Discretionary | Restaurants | 52.516175 | -8.698917 | 1.478877 | 142 | 27 | 159000000 | 1293000000 | 2.97 | 4.353535e+08 | 17.682214 | -3.838260 |
| 112 | EQT | EQT Corporation | Energy | Oil & Gas Exploration & Production | 52.130001 | -21.253771 | 2.364883 | 2 | 201 | 523803000 | 85171000 | 0.56 | 1.520911e+08 | 93.089287 | 9.567952 |
| 147 | HAL | Halliburton Co. | Energy | Oil & Gas Equipment & Services | 34.040001 | -5.101751 | 1.966062 | 4 | 189 | 7786000000 | -671000000 | -0.79 | 8.493671e+08 | 93.089287 | 17.345857 |
| 89 | DFS | Discover Financial Services | Financials | Consumer Finance | 53.619999 | 3.653584 | 1.159897 | 20 | 99 | 2288000000 | 2297000000 | 5.14 | 4.468872e+08 | 10.431906 | -0.375934 |
| 173 | IVZ | Invesco Ltd. | Financials | Asset Management & Custody Banks | 33.480000 | 7.067477 | 1.580839 | 12 | 67 | 412000000 | 968100000 | 2.26 | 4.283628e+08 | 14.814159 | 4.218620 |
df1.isnull().sum().sort_values(ascending=False)
# Return the count of missing values column-wise and sort them in descending order
Ticker Symbol 0 Security 0 GICS Sector 0 GICS Sub Industry 0 Current Price 0 Price Change 0 Volatility 0 ROE 0 Cash Ratio 0 Net Cash Flow 0 Net Income 0 Earnings Per Share 0 Estimated Shares Outstanding 0 P/E Ratio 0 P/B Ratio 0 dtype: int64
There are no missing values. Hence, there is no need for missing value treatment.
# Selecting duplicate rows except first
# occurrence based on all columns
duplicate_new = df1[df1.duplicated()]
duplicate_new
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio |
|---|
There are no duplicate rows
df1
| Ticker Symbol | Security | GICS Sector | GICS Sub Industry | Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AAL | American Airlines Group | Industrials | Airlines | 42.349998 | 9.999995 | 1.687151 | 135 | 51 | -604000000 | 7610000000 | 11.39 | 6.681299e+08 | 3.718174 | -8.784219 |
| 1 | ABBV | AbbVie | Health Care | Pharmaceuticals | 59.240002 | 8.339433 | 2.197887 | 130 | 77 | 51000000 | 5144000000 | 3.15 | 1.633016e+09 | 18.806350 | -8.750068 |
| 2 | ABT | Abbott Laboratories | Health Care | Health Care Equipment | 44.910000 | 11.301121 | 1.273646 | 21 | 67 | 938000000 | 4423000000 | 2.94 | 1.504422e+09 | 15.275510 | -0.394171 |
| 3 | ADBE | Adobe Systems Inc | Information Technology | Application Software | 93.940002 | 13.977195 | 1.357679 | 9 | 180 | -240840000 | 629551000 | 1.26 | 4.996437e+08 | 74.555557 | 4.199651 |
| 4 | ADI | Analog Devices, Inc. | Information Technology | Semiconductors | 55.320000 | -1.827858 | 1.701169 | 14 | 272 | 315120000 | 696878000 | 0.31 | 2.247994e+09 | 178.451613 | 1.059810 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 335 | YHOO | Yahoo Inc. | Information Technology | Internet Software & Services | 33.259998 | 14.887727 | 1.845149 | 15 | 459 | -1032187000 | -4359082000 | -4.64 | 9.394573e+08 | 28.976191 | 6.261775 |
| 336 | YUM | Yum! Brands Inc | Consumer Discretionary | Restaurants | 52.516175 | -8.698917 | 1.478877 | 142 | 27 | 159000000 | 1293000000 | 2.97 | 4.353535e+08 | 17.682214 | -3.838260 |
| 337 | ZBH | Zimmer Biomet Holdings | Health Care | Health Care Equipment | 102.589996 | 9.347683 | 1.404206 | 1 | 100 | 376000000 | 147000000 | 0.78 | 1.884615e+08 | 131.525636 | -23.884449 |
| 338 | ZION | Zions Bancorp | Financials | Regional Banks | 27.299999 | -1.158588 | 1.468176 | 4 | 99 | -43623000 | 309471000 | 1.20 | 2.578925e+08 | 22.749999 | -0.063096 |
| 339 | ZTS | Zoetis | Health Care | Pharmaceuticals | 47.919998 | 16.678836 | 1.610285 | 32 | 65 | 272000000 | 339000000 | 0.68 | 4.985294e+08 | 70.470585 | 1.723068 |
340 rows × 15 columns
# Lets look at the statistical summary of the data
df1.describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Ticker Symbol | 340 | 340 | AAL | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Security | 340 | 340 | 3M Company | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| GICS Sector | 340 | 11 | Industrials | 53 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| GICS Sub Industry | 340 | 104 | Oil & Gas Exploration & Production | 16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Current Price | 340.0 | NaN | NaN | NaN | 80.862345 | 98.055086 | 4.5 | 38.555 | 59.705 | 92.880001 | 1274.949951 |
| Price Change | 340.0 | NaN | NaN | NaN | 4.078194 | 12.006338 | -47.129693 | -0.939484 | 4.819505 | 10.695493 | 55.051683 |
| Volatility | 340.0 | NaN | NaN | NaN | 1.525976 | 0.591798 | 0.733163 | 1.134878 | 1.385593 | 1.695549 | 4.580042 |
| ROE | 340.0 | NaN | NaN | NaN | 39.597059 | 96.547538 | 1.0 | 9.75 | 15.0 | 27.0 | 917.0 |
| Cash Ratio | 340.0 | NaN | NaN | NaN | 70.023529 | 90.421331 | 0.0 | 18.0 | 47.0 | 99.0 | 958.0 |
| Net Cash Flow | 340.0 | NaN | NaN | NaN | 55537620.588235 | 1946365312.175789 | -11208000000.0 | -193906500.0 | 2098000.0 | 169810750.0 | 20764000000.0 |
| Net Income | 340.0 | NaN | NaN | NaN | 1494384602.941176 | 3940150279.327937 | -23528000000.0 | 352301250.0 | 707336000.0 | 1899000000.0 | 24442000000.0 |
| Earnings Per Share | 340.0 | NaN | NaN | NaN | 2.776662 | 6.587779 | -61.2 | 1.5575 | 2.895 | 4.62 | 50.09 |
| Estimated Shares Outstanding | 340.0 | NaN | NaN | NaN | 577028337.754029 | 845849595.417695 | 27672156.86 | 158848216.1 | 309675137.8 | 573117457.325 | 6159292035.0 |
| P/E Ratio | 340.0 | NaN | NaN | NaN | 32.612563 | 44.348731 | 2.935451 | 15.044653 | 20.819876 | 31.764755 | 528.039074 |
| P/B Ratio | 340.0 | NaN | NaN | NaN | -1.718249 | 13.966912 | -76.119077 | -4.352056 | -1.06717 | 3.917066 | 129.064585 |
category = ["Security", "GICS Sector", "GICS Sub Industry"]
for column in category:
print(df1[column].value_counts())
print("_" * 40)
3M Company 1
Newmont Mining Corp. (Hldg. Co.) 1
ONEOK 1
O'Reilly Automotive 1
Nucor Corp. 1
..
EQT Corporation 1
EOG Resources 1
E*Trade 1
Dun & Bradstreet 1
eBay Inc. 1
Name: Security, Length: 340, dtype: int64
________________________________________
Industrials 53
Financials 49
Consumer Discretionary 40
Health Care 40
Information Technology 33
Energy 30
Real Estate 27
Utilities 24
Materials 20
Consumer Staples 19
Telecommunications Services 5
Name: GICS Sector, dtype: int64
________________________________________
Oil & Gas Exploration & Production 16
REITs 14
Industrial Conglomerates 14
Internet Software & Services 12
Electric Utilities 12
MultiUtilities 11
Health Care Equipment 11
Banks 10
Property & Casualty Insurance 8
Diversified Financial Services 7
Biotechnology 7
Pharmaceuticals 6
Semiconductors 6
Packaged Foods & Meats 6
Oil & Gas Refining & Marketing & Transportation 6
Diversified Chemicals 5
Consumer Finance 5
Industrial Machinery 5
Health Care Facilities 5
Airlines 5
Integrated Oil & Gas 5
Managed Health Care 5
Retail REITs 4
Hotels, Resorts & Cruise Lines 4
Aerospace & Defense 4
Integrated Telecommunications Services 4
Building Products 4
Residential REITs 4
Railroads 4
Internet & Direct Marketing Retail 4
Specialty Chemicals 4
Soft Drinks 4
Research & Consulting Services 4
Asset Management & Custody Banks 4
Insurance Brokers 3
Regional Banks 3
Life & Health Insurance 3
IT Consulting & Other Services 3
Household Products 3
Oil & Gas Equipment & Services 3
Health Care Distributors 3
Restaurants 3
Air Freight & Logistics 3
Specialized REITs 3
Specialty Stores 3
Construction & Farm Machinery & Heavy Trucks 3
Cable & Satellite 3
Health Care Supplies 2
Tobacco 2
Paper Packaging 2
Application Software 2
Auto Parts & Equipment 2
Automobile Manufacturers 2
Leisure Products 2
Investment Banking & Brokerage 2
Broadcasting & Cable TV 2
Advertising 2
Construction Materials 2
Data Processing & Outsourced Services 2
Electronic Components 2
Homebuilding 2
Fertilizers & Agricultural Chemicals 2
Technology Hardware, Storage & Peripherals 1
Thrifts & Mortgage Finance 1
Steel 1
Specialty Retail 1
Publishing 1
Real Estate Services 1
Tires & Rubber 1
Personal Products 1
Semiconductor Equipment 1
Trucking 1
Technology, Hardware, Software and Supplies 1
Industrial Gases 1
Office REITs 1
Networking Equipment 1
Agricultural Products 1
Alternative Carriers 1
Apparel, Accessories & Luxury Goods 1
Brewers 1
Casinos & Gaming 1
Computer Hardware 1
Consumer Electronics 1
Copper 1
Distributors 1
Diversified Commercial Services 1
Drug Retail 1
Electrical Components & Equipment 1
Electronic Equipment & Instruments 1
Environmental Services 1
Financial Exchanges & Data 1
Gold 1
Home Entertainment Software 1
Home Furnishings 1
Household Appliances 1
Housewares & Specialties 1
Human Resource & Employment Services 1
Industrial Materials 1
Life Sciences Tools & Services 1
Metal & Glass Containers 1
Motorcycle Manufacturers 1
Multi-Sector Holdings 1
Multi-line Insurance 1
Water Utilities 1
Name: GICS Sub Industry, dtype: int64
________________________________________
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 340 entries, 0 to 339 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ticker Symbol 340 non-null category 1 Security 340 non-null category 2 GICS Sector 340 non-null category 3 GICS Sub Industry 340 non-null category 4 Current Price 340 non-null float64 5 Price Change 340 non-null float64 6 Volatility 340 non-null float64 7 ROE 340 non-null int64 8 Cash Ratio 340 non-null int64 9 Net Cash Flow 340 non-null int64 10 Net Income 340 non-null int64 11 Earnings Per Share 340 non-null float64 12 Estimated Shares Outstanding 340 non-null float64 13 P/E Ratio 340 non-null float64 14 P/B Ratio 340 non-null float64 dtypes: category(4), float64(7), int64(4) memory usage: 58.1 KB
# outlier detection using boxplot
numeric_columns = df1.select_dtypes(include=np.number).columns.tolist()
# let's plot the boxplots of all columns to check for outliers
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numeric_columns):
plt.subplot(5, 4, i + 1)
plt.boxplot(df1[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
# Outlier treatment
def treatment_outliers(df1, col):
Q1 = df1[col].quantile(0.25) # 25th quantile
Q3 = df1[col].quantile(0.75) # 75th quantile
IQR = Q3 - Q1
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
# all the values smaller than Lower_Whisker will be assigned the value of Lower_Whisker
# all the values greater than Upper_Whisker will be assigned the value of Upper_Whisker
df1[col] = np.clip(df1[col], Lower_Whisker, Upper_Whisker)
return df1
def treat_all_outliers(df1, col_list):
for c in col_list:
df1 = treatment_outliers(df1, c)
return df1
# treating the outliers
numerical_col = df1.select_dtypes(include=np.number).columns.tolist()
df1 = treat_all_outliers(df1, numerical_col)
# let's look at the boxplots to see if the outliers have been treated or not
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numeric_columns):
plt.subplot(5, 4, i + 1)
plt.boxplot(df1[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
The Outliers are treated.
# variables used for clustering
numerical_column
['Current Price', 'Price Change', 'Volatility', 'ROE', 'Cash Ratio', 'Net Cash Flow', 'Net Income', 'Earnings Per Share', 'Estimated Shares Outstanding', 'P/E Ratio', 'P/B Ratio']
# scaling the dataset before performing clustering
scaler = StandardScaler()
subset = df1[numerical_column].copy()
subset_scaled = scaler.fit_transform(subset)
# creating a dataframe of the above scaled columns
subset_scaled_df = pd.DataFrame(subset_scaled, columns=subset.columns)
subset_scaled_df_HC = subset_scaled_df.copy()
subset_scaled_df
| Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.650712 | 0.549287 | 0.448212 | 2.116831 | -0.207311 | -1.433763 | 1.977138 | 2.114902 | 0.645054 | -1.493768 | -0.997339 |
| 1 | -0.257962 | 0.387909 | 1.568844 | 2.116831 | 0.238584 | 0.133007 | 1.977138 | 0.017981 | 2.124799 | -0.462139 | -0.992836 |
| 2 | -0.591183 | 0.675734 | -0.459081 | 0.031532 | 0.067086 | 1.722229 | 1.977138 | -0.054640 | 2.124799 | -0.703554 | 0.108970 |
| 3 | 0.548932 | 0.935802 | -0.274700 | -0.753522 | 2.005016 | -0.565079 | -0.371735 | -0.635605 | 0.171420 | 2.138684 | 0.714710 |
| 4 | -0.349115 | -0.600175 | 0.478969 | -0.426416 | 2.699583 | 0.764786 | -0.327678 | -0.964127 | 2.124799 | 2.138684 | 0.300691 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 335 | -0.862086 | 1.024290 | 0.794883 | -0.360995 | 2.699583 | -1.757838 | -2.071339 | -2.121302 | 1.407785 | 0.233207 | 0.986620 |
| 336 | -0.414314 | -1.267923 | -0.008772 | 2.116831 | -0.618907 | 0.391344 | 0.062409 | -0.044265 | -0.009307 | -0.539000 | -0.345167 |
| 337 | 0.750073 | 0.485894 | -0.172613 | -1.276891 | 0.633030 | 0.910412 | -0.687504 | -0.801595 | -0.703349 | 2.138684 | -2.048462 |
| 338 | -1.000676 | -0.535134 | -0.032253 | -1.080627 | 0.615880 | -0.093333 | -0.581187 | -0.656354 | -0.508170 | -0.192499 | 0.152625 |
| 339 | -0.521191 | 1.198354 | 0.279555 | 0.751165 | 0.032786 | 0.661642 | -0.561864 | -0.836176 | 0.168287 | 2.138684 | 0.388148 |
340 rows × 11 columns
subset_scaled_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Current Price | 340.0 | -1.567374e-17 | 1.001474 | -1.530854 | -0.738959 | -0.247149 | 0.524283 | 2.419145 |
| Price Change | 340.0 | -5.485808e-17 | 1.001474 | -2.209917 | -0.513841 | 0.045833 | 0.616877 | 2.312954 |
| Volatility | 340.0 | -4.120887e-16 | 1.001474 | -1.644982 | -0.763558 | -0.213452 | 0.466638 | 2.311932 |
| ROE | 340.0 | -9.469549e-17 | 1.001474 | -1.276891 | -0.704456 | -0.360995 | 0.424059 | 2.116831 |
| Cash Ratio | 340.0 | 2.024524e-17 | 1.001474 | -1.081952 | -0.773255 | -0.275910 | 0.615880 | 2.699583 |
| Net Cash Flow | 340.0 | -3.689859e-17 | 1.001474 | -1.757838 | -0.452813 | 0.016033 | 0.417204 | 1.722229 |
| Net Income | 340.0 | -1.012262e-17 | 1.001474 | -2.071339 | -0.553160 | -0.320835 | 0.458959 | 1.977138 |
| Earnings Per Share | 340.0 | 1.208184e-17 | 1.001474 | -2.121302 | -0.532726 | -0.070201 | 0.526325 | 2.114902 |
| Estimated Shares Outstanding | 340.0 | 8.449124e-17 | 1.001474 | -1.155346 | -0.786595 | -0.362603 | 0.377963 | 2.124799 |
| P/E Ratio | 340.0 | -2.703720e-16 | 1.001474 | -1.547285 | -0.719339 | -0.324468 | 0.423870 | 2.138684 |
| P/B Ratio | 340.0 | 6.465416e-17 | 1.001474 | -2.048462 | -0.412916 | 0.020228 | 0.677448 | 2.312994 |
The Data is scaled and is now ready for clustering.
df2 = df1.copy()
# selecting numerical columns
numerical_column = df2.select_dtypes(include=np.number).columns.tolist()
for feature in numerical_column:
histogram_boxplot(df2, feature)
After outlier treatment and scaling,
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
fig.suptitle("CDF plot of numerical variables", fontsize=20)
counter = 0
for ii in range(3):
sns.ecdfplot(ax=axes[ii][0], x=df2[numerical_column[counter]])
counter = counter + 1
if counter != 5:
sns.ecdfplot(ax=axes[ii][1], x=df2[numerical_column[counter]])
counter = counter + 1
else:
pass
fig.tight_layout(pad=2.0)
# let's explore GICS Sector
labeled_barplot(df2, "GICS Sector", perc=True)
plt.figure(figsize=(15, 7))
sns.heatmap(
df2[numerical_column].corr(),
annot=True,
vmin=-1,
vmax=1,
fmt=".2f",
cmap="Spectral",
)
plt.show()
Volatility and Current Price, Price Change, Net Cash Flow, Net Income, Earnings Per Share and Estimated Shares Outstanding. Current Price and ROE, Net Cash Flow. Price Change and ROE. ROE has negative correlation with all except Volatility and P/E Ratio. Net Cash FLow is negatively correlated with current price, Volatility and Estimated shares outstanding. Earnings per share and Estimated shares outstanding, P/E Ratio. P/E Ratio is negatively correlated with Price Change, Net Income, Earnings Per Change and Estimated shares outstanding. showing, if one value increases the other decreases.
sns.pairplot(data=df2[numerical_column], diag_kind="kde")
plt.show()
histogram_boxplot(df2, "Current Price")
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Price Change", data=df2)
plt.figure(figsize=(15, 7))
sns.heatmap(
df2[numerical_column].corr(),
annot=True,
vmin=-1,
vmax=1,
fmt=".2f",
cmap="Spectral",
)
plt.show()
Volatility and Current Price, Price Change, Net Cash Flow, Net Income, Earnings Per Share and Estimated Shares Outstanding. Current Price and ROE, Net Cash Flow. Price Change and ROE. ROE has negative correlation with all except Volatility and P/E Ratio. Net Cash FLow is negatively correlated with current price, Volatility and Estimated shares outstanding. Earnings per share and Estimated shares outstanding, P/E Ratio. P/E Ratio is negatively correlated with Price Change, Net Income, Earnings Per Change and Estimated shares outstanding. showing, if one value increases the other decreases.
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="Cash Ratio", data=df2)
plt.figure(figsize=(15, 5))
ax = sns.barplot(x="GICS Sector", y="P/E Ratio", data=df2)
clusters = range(1, 9)
meanDistortions = []
for k in clusters:
model = KMeans(n_clusters=k)
model.fit(subset_scaled_df)
prediction = model.predict(subset_scaled_df)
distortion = (
sum(
np.min(cdist(subset_scaled_df, model.cluster_centers_, "euclidean"), axis=1)
)
/ subset_scaled_df.shape[0]
)
meanDistortions.append(distortion)
print("Number of Clusters:", k, "\tAverage Distortion:", distortion)
plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average distortion")
plt.title("Selecting k with the Elbow Method")
plt.show()
C:\Users\kwudali\Anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:881: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
Number of Clusters: 1 Average Distortion: 3.1482665386211908 Number of Clusters: 2 Average Distortion: 2.9491046912239587 Number of Clusters: 3 Average Distortion: 2.7586235347054844 Number of Clusters: 4 Average Distortion: 2.6324515003371367 Number of Clusters: 5 Average Distortion: 2.5349290565854523 Number of Clusters: 6 Average Distortion: 2.454942164539365 Number of Clusters: 7 Average Distortion: 2.3887833542845023 Number of Clusters: 8 Average Distortion: 2.328622793920396
Appropriate value of K seems to be 3 or 4.
sil_score = []
cluster_list = list(range(2, 10))
for n_clusters in cluster_list:
clusterer = KMeans(n_clusters=n_clusters)
preds = clusterer.fit_predict((subset_scaled_df))
# centers = clusterer.cluster_centers_
score = silhouette_score(subset_scaled_df, preds)
sil_score.append(score)
print("For n_clusters = {}, the silhouette score is {})".format(n_clusters, score))
plt.plot(cluster_list, sil_score)
plt.show()
For n_clusters = 2, the silhouette score is 0.12412829033019597) For n_clusters = 3, the silhouette score is 0.18319118832785253) For n_clusters = 4, the silhouette score is 0.16781552306465286) For n_clusters = 5, the silhouette score is 0.1454730568964312) For n_clusters = 6, the silhouette score is 0.14935671403139855) For n_clusters = 7, the silhouette score is 0.143602264425146) For n_clusters = 8, the silhouette score is 0.139151746765594) For n_clusters = 9, the silhouette score is 0.12869576696818302)
The higher the Silhouette score, the better the cluster. Silhouette score is highest for n_clusters = 3.
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(3, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 3 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(4, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(5, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 5 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(2, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 2 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(6, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 6 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(7, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 7 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# finding optimal no. of clusters with silhouette coefficients
visualizer = SilhouetteVisualizer(KMeans(8, random_state=1))
visualizer.fit(subset_scaled_df)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 340 Samples in 8 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# let's take 6 as number of clusters
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(subset_scaled_df)
KMeans(n_clusters=6, random_state=0)
# adding kmeans cluster labels to the original and scaled dataframes
df2["K_means_segments"] = kmeans.labels_
subset_scaled_df["K_means_segments"] = kmeans.labels_
cluster_profile = df2.groupby("K_means_segments").mean()
cluster_profile["count_in_each_segments"] = (
df2.groupby("K_means_segments")["GICS Sector"].count().values
)
# let's display cluster profiles
cluster_profile.style.highlight_max(color="lightgreen", axis=0)
| Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | count_in_each_segments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| K_means_segments | ||||||||||||
| 0 | 95.019096 | 10.245215 | 1.716429 | 19.071429 | 178.285714 | 271351160.714286 | 629838407.142857 | 2.604036 | 428507493.542500 | 37.737244 | 6.686040 | 35 |
| 1 | 51.637612 | 3.835364 | 1.381319 | 15.944245 | 43.856115 | -30510651.978417 | 636002115.107914 | 2.372698 | 280122073.373327 | 22.736805 | -2.211137 | 139 |
| 2 | 128.920679 | 5.416849 | 1.322512 | 25.694915 | 39.440678 | -33732309.322034 | 1217143358.050848 | 5.945911 | 191628023.559322 | 24.182609 | -5.033461 | 59 |
| 3 | 60.933429 | 7.939730 | 1.342802 | 25.207143 | 74.842857 | 600877307.142857 | 3498474589.285714 | 4.338429 | 907889427.659286 | 16.771349 | -1.684837 | 35 |
| 4 | 32.952294 | -10.893319 | 2.389966 | 25.500000 | 47.933333 | -95968745.833333 | -901092008.333333 | -1.479667 | 482668071.978833 | 46.315921 | 1.590948 | 30 |
| 5 | 63.869928 | 7.521379 | 1.318429 | 22.122024 | 64.988095 | -547221300.595238 | 3083802723.214286 | 4.145506 | 896403522.160416 | 19.232600 | -0.796296 | 42 |
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
fig.suptitle("Boxplot of scaled numerical variables for each cluster", fontsize=20)
counter = 0
for ii in range(5):
sns.boxplot(
ax=axes[ii],
y=subset_scaled_df[numerical_column[counter]],
x=subset_scaled_df["K_means_segments"],
)
counter = counter + 1
fig.tight_layout(pad=2.0)
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
fig.suptitle("Boxplot of original numerical variables for each cluster", fontsize=20)
counter = 0
for ii in range(5):
sns.boxplot(
ax=axes[ii], y=df2[numerical_column[counter]], x=df2["K_means_segments"]
)
counter = counter + 1
fig.tight_layout(pad=2.0)
df2.groupby("K_means_segments").mean().plot.bar(figsize=(15, 6))
<AxesSubplot:xlabel='K_means_segments'>
subset_scaled_df.groupby("K_means_segments").mean().plot.bar(figsize=(15, 6))
<AxesSubplot:xlabel='K_means_segments'>
pd.crosstab(df2.K_means_segments, df2["GICS Sector"]).style.highlight_max(
color="lightgreen", axis=0
)
| GICS Sector | Consumer Discretionary | Consumer Staples | Energy | Financials | Health Care | Industrials | Information Technology | Materials | Real Estate | Telecommunications Services | Utilities |
|---|---|---|---|---|---|---|---|---|---|---|---|
| K_means_segments | |||||||||||
| 0 | 4 | 2 | 1 | 1 | 10 | 0 | 12 | 3 | 1 | 1 | 0 |
| 1 | 21 | 7 | 0 | 22 | 10 | 26 | 7 | 6 | 19 | 1 | 20 |
| 2 | 10 | 3 | 1 | 7 | 10 | 11 | 2 | 7 | 6 | 0 | 2 |
| 3 | 4 | 3 | 1 | 9 | 7 | 5 | 4 | 0 | 0 | 1 | 1 |
| 4 | 0 | 0 | 23 | 0 | 0 | 2 | 2 | 2 | 1 | 0 | 0 |
| 5 | 1 | 4 | 4 | 10 | 3 | 9 | 6 | 2 | 0 | 2 | 1 |
# list of distance metrics
distance_metrics = ["euclidean", "chebyshev", "mahalanobis", "cityblock"]
# list of linkage methods
linkage_methods = ["single", "complete", "average", "weighted"]
high_cophenet_corr = 0
high_dm_lm = [0, 0]
for dm in distance_metrics:
for lm in linkage_methods:
Z = linkage(subset_scaled_df_HC, metric=dm, method=lm)
c, coph_dists = cophenet(Z, pdist(subset_scaled_df_HC))
print(
"Cophenetic correlation for {} distance and {} linkage is {}.".format(
dm.capitalize(), lm, c
)
)
if high_cophenet_corr < c:
high_cophenet_corr = c
high_dm_lm[0] = dm
high_dm_lm[1] = lm
Cophenetic correlation for Euclidean distance and single linkage is 0.6932108674136778. Cophenetic correlation for Euclidean distance and complete linkage is 0.6147798823663769. Cophenetic correlation for Euclidean distance and average linkage is 0.7325610568988988. Cophenetic correlation for Euclidean distance and weighted linkage is 0.6496438783746495. Cophenetic correlation for Chebyshev distance and single linkage is 0.7214024727237498. Cophenetic correlation for Chebyshev distance and complete linkage is 0.3680593124794425. Cophenetic correlation for Chebyshev distance and average linkage is 0.7011385715318806. Cophenetic correlation for Chebyshev distance and weighted linkage is 0.5032471223561586. Cophenetic correlation for Mahalanobis distance and single linkage is 0.6176064847170997. Cophenetic correlation for Mahalanobis distance and complete linkage is 0.38543044504439705. Cophenetic correlation for Mahalanobis distance and average linkage is 0.641587600792573. Cophenetic correlation for Mahalanobis distance and weighted linkage is 0.5433984553270202. Cophenetic correlation for Cityblock distance and single linkage is 0.6427079205007207. Cophenetic correlation for Cityblock distance and complete linkage is 0.5327604320031705. Cophenetic correlation for Cityblock distance and average linkage is 0.7279410646836356. Cophenetic correlation for Cityblock distance and weighted linkage is 0.6077902866991017.
# printing the combination of distance metric and linkage method with the highest cophenetic correlation
print(
"Highest cophenetic correlation is {}, which is obtained with {} distance and {} linkage.".format(
high_cophenet_corr, high_dm_lm[0].capitalize(), high_dm_lm[1]
)
)
Highest cophenetic correlation is 0.7325610568988988, which is obtained with Euclidean distance and average linkage.
# list of linkage methods
linkage_methods = ["single", "complete", "average", "centroid", "ward", "weighted"]
high_cophenet_corr = 0
high_dm_lm = [0, 0]
for lm in linkage_methods:
Z = linkage(subset_scaled_df_HC, metric="euclidean", method=lm)
c, coph_dists = cophenet(Z, pdist(subset_scaled_df_HC))
print("Cophenetic correlation for {} linkage is {}.".format(lm, c))
if high_cophenet_corr < c:
high_cophenet_corr = c
high_dm_lm[0] = "euclidean"
high_dm_lm[1] = lm
Cophenetic correlation for single linkage is 0.6932108674136778. Cophenetic correlation for complete linkage is 0.6147798823663769. Cophenetic correlation for average linkage is 0.7325610568988988. Cophenetic correlation for centroid linkage is 0.7294329307928588. Cophenetic correlation for ward linkage is 0.5753306117577638. Cophenetic correlation for weighted linkage is 0.6496438783746495.
# printing the combination of distance metric and linkage method with the highest cophenetic correlation
print(
"Highest cophenetic correlation is {}, which is obtained with {} linkage.".format(
high_cophenet_corr, high_dm_lm[1]
)
)
Highest cophenetic correlation is 0.7325610568988988, which is obtained with average linkage.
# list of linkage methods
linkage_methods = ["single", "complete", "average", "centroid", "ward", "weighted"]
# lists to save results of cophenetic correlation calculation
compare_cols = ["Linkage", "Cophenetic Coefficient"]
compare = []
# to create a subplot image
fig, axs = plt.subplots(len(linkage_methods), 1, figsize=(15, 30))
# We will enumerate through the list of linkage methods above
# For each linkage method, we will plot the dendrogram and calculate the cophenetic correlation
for i, method in enumerate(linkage_methods):
Z = linkage(subset_scaled_df_HC, metric="euclidean", method=method)
dendrogram(Z, ax=axs[i])
axs[i].set_title(f"Dendrogram ({method.capitalize()} Linkage)")
coph_corr, coph_dist = cophenet(Z, pdist(subset_scaled_df_HC))
axs[i].annotate(
f"Cophenetic\nCorrelation\n{coph_corr:0.2f}",
(0.80, 0.80),
xycoords="axes fraction",
)
compare.append([method, coph_corr])
HCmodel = AgglomerativeClustering(n_clusters=6, affinity="euclidean", linkage="average")
HCmodel.fit(subset_scaled_df_HC)
AgglomerativeClustering(linkage='average', n_clusters=6)
# adding hierarchical cluster labels to the original and scaled dataframes
subset_scaled_df_HC["HC_Clusters"] = HCmodel.labels_
df2["HC_Clusters"] = HCmodel.labels_
cluster_profile = df2.groupby("HC_Clusters").mean()
cluster_profile["count_in_each_segments"] = (
df2.groupby("HC_Clusters")["Current Price"].count().values
)
for c1 in df2["HC_Clusters"].unique():
print("In cluster {}, the following countries are present:".format(c1))
print(df2[df2["HC_Clusters"] == c1]["Security"].unique())
print()
In cluster 1, the following countries are present: ['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'Archer-Daniels-Midland Co', 'Alliance Data Systems', ..., 'Yahoo Inc.', 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis'] Length: 290 Categories (290, object): ['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'Archer-Daniels-Midland Co', ..., 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis'] In cluster 2, the following countries are present: ['Adobe Systems Inc', 'Analog Devices, Inc.', 'Alexion Pharmaceuticals', 'Amgen Inc', 'Amazon.com Inc', ..., 'First Solar Inc', 'Intuitive Surgical Inc.', 'Regeneron', 'TripAdvisor', 'Waters Corporation'] Length: 15 Categories (15, object): ['Adobe Systems Inc', 'Analog Devices, Inc.', 'Alexion Pharmaceuticals', 'Amgen Inc', ..., 'Intuitive Surgical Inc.', 'Regeneron', 'TripAdvisor', 'Waters Corporation'] In cluster 0, the following countries are present: ['Apache Corporation', 'Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Chesapeake Energy', 'Cabot Oil & Gas', ..., 'Spectra Energy Corp.', 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy'] Length: 31 Categories (31, object): ['Apache Corporation', 'Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Chesapeake Energy', ..., 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy'] In cluster 4, the following countries are present: ['Charter Communications'] Categories (1, object): ['Charter Communications'] In cluster 3, the following countries are present: ['Chipotle Mexican Grill'] Categories (1, object): ['Chipotle Mexican Grill'] In cluster 5, the following countries are present: ['Vertex Pharmaceuticals Inc', 'Wynn Resorts Ltd'] Categories (2, object): ['Vertex Pharmaceuticals Inc', 'Wynn Resorts Ltd']
HCmodel = AgglomerativeClustering(n_clusters=5, affinity="euclidean", linkage="ward")
HCmodel.fit(subset_scaled_df_HC)
AgglomerativeClustering(n_clusters=5)
# adding hierarchical cluster labels to the original and scaled dataframes
subset_scaled_df_HC["HC_Clusters_1"] = HCmodel.labels_
df2["HC_Clusters_1"] = HCmodel.labels_
cluster_profile_ward = df2.groupby("HC_Clusters_1").mean()
cluster_profile_ward["count_in_each_segments"] = (
df2.groupby("HC_Clusters_1")["Current Price"].count().values
)
for c1 in df2["HC_Clusters_1"].unique():
print("In cluster {}, the following countries are present:".format(c1))
print(df2[df2["HC_Clusters_1"] == c1]["Security"].unique())
print()
In cluster 2, the following countries are present: ['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'American International Group, Inc.', 'American Express Co', ..., 'United Parcel Service', 'United Technologies', 'Verizon Communications', 'Wells Fargo', 'Exxon Mobil Corp.'] Length: 43 Categories (43, object): ['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'American International Group, Inc.', ..., 'United Technologies', 'Verizon Communications', 'Wells Fargo', 'Exxon Mobil Corp.'] In cluster 1, the following countries are present: ['Adobe Systems Inc', 'Analog Devices, Inc.', 'Akamai Technologies Inc', 'Alexion Pharmaceuticals', 'Applied Materials Inc', ..., 'Vertex Pharmaceuticals Inc', 'Waters Corporation', 'Wynn Resorts Ltd', 'Xerox Corp.', 'Yahoo Inc.'] Length: 52 Categories (52, object): ['Adobe Systems Inc', 'Analog Devices, Inc.', 'Akamai Technologies Inc', 'Alexion Pharmaceuticals', ..., 'Waters Corporation', 'Wynn Resorts Ltd', 'Xerox Corp.', 'Yahoo Inc.'] In cluster 0, the following countries are present: ['Archer-Daniels-Midland Co', 'Ameren Corp', 'American Electric Power', 'AFLAC Inc', 'Apartment Investment & Mgmt', ..., 'Xylem Inc.', 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis'] Length: 197 Categories (197, object): ['Archer-Daniels-Midland Co', 'Ameren Corp', 'American Electric Power', 'AFLAC Inc', ..., 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis'] In cluster 4, the following countries are present: ['Alliance Data Systems', 'Alaska Air Group Inc', 'Boeing Company', 'BIOGEN IDEC Inc.', 'General Dynamics', ..., 'Sherwin-Williams', 'Simon Property Group Inc', 'The Travelers Companies Inc.', 'Tesoro Petroleum Co.', 'Valero Energy'] Length: 25 Categories (25, object): ['Alliance Data Systems', 'Alaska Air Group Inc', 'Boeing Company', 'BIOGEN IDEC Inc.', ..., 'Simon Property Group Inc', 'The Travelers Companies Inc.', 'Tesoro Petroleum Co.', 'Valero Energy'] In cluster 3, the following countries are present: ['Apache Corporation', 'Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Chesapeake Energy', 'Cabot Oil & Gas', ..., 'Spectra Energy Corp.', 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy'] Length: 23 Categories (23, object): ['Apache Corporation', 'Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Chesapeake Energy', ..., 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy']
# let's display cluster profiles
cluster_profile_ward.style.highlight_max(color="lightgreen", axis=0)
| Current Price | Price Change | Volatility | ROE | Cash Ratio | Net Cash Flow | Net Income | Earnings Per Share | Estimated Shares Outstanding | P/E Ratio | P/B Ratio | K_means_segments | HC_Clusters | count_in_each_segments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| HC_Clusters_1 | ||||||||||||||
| 0 | 67.842897 | 4.165814 | 1.379430 | 17.510152 | 43.710660 | -22317788.705584 | 791304172.588832 | 3.002418 | 274869321.770762 | 23.712726 | -3.088535 | 1.472081 | 1.005076 | 197 |
| 1 | 71.569176 | 9.460529 | 1.745668 | 17.067308 | 147.490385 | 136610663.461538 | 924025872.596154 | 2.246563 | 619951719.447115 | 32.907674 | 5.329730 | 1.423077 | 1.365385 | 52 |
| 2 | 65.316907 | 5.686257 | 1.236535 | 25.276163 | 67.127907 | 59225113.372093 | 3958289215.116279 | 4.497558 | 1012930930.315697 | 16.923639 | -4.105437 | 3.883721 | 1.000000 | 43 |
| 3 | 32.947827 | -11.234540 | 2.393770 | 31.478261 | 43.347826 | -136632809.782609 | -1356654576.086957 | -2.242609 | 473814086.308804 | 50.100914 | 0.689859 | 4.000000 | 0.000000 | 23 |
| 4 | 130.412701 | 7.182146 | 1.337093 | 33.130000 | 51.440000 | -147072915.000000 | 2570160510.000000 | 8.128350 | 332216165.291600 | 17.183748 | 3.078650 | 2.760000 | 1.000000 | 25 |
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
fig.suptitle("Boxplot of scaled numerical variables for each cluster", fontsize=20)
counter = 0
for ii in range(5):
sns.boxplot(
ax=axes[ii],
y=subset_scaled_df_HC[numerical_column[counter]],
x=subset_scaled_df_HC["HC_Clusters_1"],
)
counter = counter + 1
fig.tight_layout(pad=2.0)
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
fig.suptitle("Boxplot of original numerical variables for each cluster", fontsize=20)
counter = 0
for ii in range(5):
sns.boxplot(ax=axes[ii], y=df2[numerical_column[counter]], x=df2["HC_Clusters_1"])
counter = counter + 1
fig.tight_layout(pad=2.0)
Let's compare Cluster vs GICS Sector
pd.crosstab(df2.HC_Clusters_1, df2["GICS Sector"]).style.highlight_max(
color="lightgreen", axis=0
)
| GICS Sector | Consumer Discretionary | Consumer Staples | Energy | Financials | Health Care | Industrials | Information Technology | Materials | Real Estate | Telecommunications Services | Utilities |
|---|---|---|---|---|---|---|---|---|---|---|---|
| HC_Clusters_1 | |||||||||||
| 0 | 30 | 11 | 2 | 32 | 18 | 34 | 8 | 14 | 24 | 1 | 23 |
| 1 | 4 | 2 | 2 | 4 | 12 | 3 | 19 | 3 | 2 | 1 | 0 |
| 2 | 4 | 6 | 2 | 10 | 7 | 8 | 3 | 0 | 0 | 2 | 1 |
| 3 | 0 | 0 | 21 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 4 | 2 | 0 | 3 | 3 | 3 | 8 | 2 | 2 | 1 | 1 | 0 |
subset_scaled_df_HC.groupby("HC_Clusters_1").mean().plot.bar(figsize=(15, 6))
<AxesSubplot:xlabel='HC_Clusters_1'>
Hierarchical clustering took a lot of time to create the dendograms.
We have found companies with some very similar metrics as follows: